<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="zh-CN">
  <head>

    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <meta content="Cask Data, Inc." name="author" />
<meta content="Copyright © 2017 Cask Data, Inc." name="copyright" />
<meta content="The CDAP User Guide: Getting Started" name="description" />


    <meta name="git_release" content="6.1.1">
    <meta name="git_hash" content="05fbac36f9f7aadeb44f5728cea35136dbc243e5">
    <meta name="git_timestamp" content="2020-02-09 08:22:47 +0800">
    <title>示例: 使用纽约时报 XML 数据推送</title>

    <link rel="stylesheet" href="../_static/cdap-bootstrap.css" type="text/css" />
    <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
    <link rel="stylesheet" href="../_static/bootstrap-3.3.6/css/bootstrap.min.css" type="text/css" />
    <link rel="stylesheet" href="../_static/bootstrap-3.3.6/css/bootstrap-theme.min.css" type="text/css" />
    <link rel="stylesheet" href="../_static/css/bootstrap-sphinx.css" type="text/css" />
    <link rel="stylesheet" href="../_static/css/cdap-dynamicscrollspy-4.css" type="text/css" />
    <link rel="stylesheet" href="../_static/css/jquery.mCustomScrollbar.css" type="text/css" />
    <link rel="stylesheet" href="../_static/css/cdap-jquery.mCustomScrollbar.css" type="text/css" />
    <link rel="stylesheet" href="../_static/css/abixTreeList-2.css" type="text/css" />
    <link rel="stylesheet" href="../_static/cdap-bootstrap.css" type="text/css" />
    <link rel="stylesheet" href="../_static/css/cdap-hide-toc.css" type="text/css" />

    <script type="text/javascript">
      var DOCUMENTATION_OPTIONS = {
        URL_ROOT:    '',
        VERSION:     '6.1.1',
        COLLAPSE_INDEX: false,
        FILE_SUFFIX: '.html',
        HAS_SOURCE:  false
      };
    </script>
    <script type="text/javascript" src="../_static/jquery.js"></script>
    <script type="text/javascript" src="../_static/underscore.js"></script>
    <script type="text/javascript" src="../_static/doctools.js"></script>
    <script type="text/javascript" src="../_static/language_data.js"></script>

    <link rel="shortcut icon" href="../_static/favicon.ico"/>
    <link rel="index" title="Index" href="../genindex.html" />
    <link rel="search" title="Search" href="../search.html" />
    <link rel="top" title="Cask Data Application Platform 6.1.1 Documentation" href="../index.html" />
    <link rel="up" title="CDAP 入门指南" href="index.html" />
    <link rel="next" title="Example: Building a Stock Selection Pipeline" href="stocks.html" />
    <link rel="prev" title="示例: 使用客户地址信息分发市场推广材料" href="campaign.html" />
    <!-- block extrahead -->
    <meta charset='utf-8'>
    <meta http-equiv='X-UA-Compatible' content='IE=edge,chrome=1'>
    <meta name='viewport' content='width=device-width, initial-scale=1.0, maximum-scale=1'>
    <meta name="apple-mobile-web-app-capable" content="yes">
    <!-- block extrahead end -->

</head>
<body role="document">

<!-- block navbar -->
<div id="navbar" class="navbar navbar-inverse navbar-default navbar-fixed-top">
    <div class="container-fluid">
      <div class="row">
        <div class="navbar-header">
          <!-- .btn-navbar is used as the toggle for collapsed navbar content -->
          <a class="navbar-brand" href="../table-of-contents/../../index.html">
            <span><img alt="CDAP logo" src="../_static/cdap_logo.svg"/></span>
          </a>

          <button type="button" class="navbar-toggle" data-toggle="collapse" data-target=".nav-collapse">
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
          </button>

          <div class="pull-right">
            <div class="dropdown version-dropdown">
              <a href="#" class="dropdown-toggle" data-toggle="dropdown"
                role="button" aria-haspopup="true" aria-expanded="false">
                v 6.1.1 <span class="caret"></span>
              </a>
              <ul class="dropdown-menu">
                <li><a href="//docs.cdap.io/cdap/5.1.2/en/index.html">v 5.1.2</a></li>
                <li><a href="//docs.cdap.io/cdap/4.3.4/en/index.html">v 4.3.4</a></li>
              </ul>
            </div>
          </div>
          <form class="navbar-form navbar-right navbar-search" action="../search.html" method="get">
            <div class="form-group">
              <div class="navbar-search-image material-icons"></div>
              <input type="text" name="q" class="form-control" placeholder="  Search" />
            </div>
            <input type="hidden" name="check_keywords" value="yes" />
            <input type="hidden" name="area" value="default" />
          </form>

          <div class="collapse navbar-collapse nav-collapse navbar-right navbar-navigation">
            <ul class="nav navbar-nav"><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link " href="../table-of-contents/../../index.html">简介</a></li><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link current" href="../table-of-contents/../../guides.html">手册</a></li><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link " href="../table-of-contents/../../reference-manual/index.html">参考</a></li><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link " href="../table-of-contents/../../faqs/index.html">帮助</a></li>
            </ul>
          </div>

        </div>
      </div>
    </div>
  </div><!-- block navbar end -->
<!-- block main content -->
<div class="main-container container">
  <div class="row"><div class="col-md-2">
      <div id="sidebar" class="bs-sidenav scrollable-y-outside" role="complementary">
<!-- theme_manual: user-guide -->
<!-- theme_manual_highlight: guides -->
<!-- sidebar_title_link: ../table-of-contents/../../guides.html -->

  <div role="note" aria-label="manuals links"><h3><a href="../table-of-contents/../../guides.html">Guides</a></h3>

    <ul class="this-page-menu">
      <li class="toctree-l1"><b><a href="../table-of-contents/../../user-guide/index.html" rel="nofollow">用户手册</a></b>
      <nav class="pagenav">
      <ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../index.html"> 简介</a></li>
<li class="toctree-l1"><a class="reference internal" href="../overview.html"> 概述</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="index.html"> 入门指南</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="campaign.html">MySQL 客户数据</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">纽约时报 XML 数据推送</a></li>
<li class="toctree-l2"><a class="reference internal" href="stocks.html">股票选择</a></li>
<li class="toctree-l2"><a class="reference internal" href="fitbit.html">物联网 IoT 设备数据</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../data-preparation/index.html"> 数据预处理</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../data-preparation/concepts.html">      概念</a></li>
<li class="toctree-l2"><a class="reference internal" href="../data-preparation/directives/index.html">      数据处理指令</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/catalog-lookup.html">catalog-lookup</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/change-column-case.html">change-column-case</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/changing-case.html">changing-case</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/cleanse-column-names.html">cleanse-column-names</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/columns-replace.html">columns-replace</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/copy.html">copy</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/cut-character.html">cut-character</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/decode.html">decode</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/diff-date.html">diff-date</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/drop.html">drop</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/encode.html">encode</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/extract-regex-groups.html">extract-regex-groups</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/fail.html">fail</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/fill-null-or-empty.html">fill-null-or-empty</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/filter-row-if-matched.html">filter-row-if-matched</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/filter-row-if-true.html">filter-row-if-true</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/filter-rows-on.html">filter-rows-on</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/find-and-replace.html">find-and-replace</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/flatten.html">flatten</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/format-date.html">format-date</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/format-unix-timestamp.html">format-unix-timestamp</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/generate-uuid.html">generate-uuid</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/hash.html">hash</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/increment-variable.html">increment-variable</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/index-split.html">index-split</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/invoke-http.html">invoke-http</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/json-path.html">json-path</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/keep.html">keep</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/mask-number.html">mask-number</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/mask-shuffle.html">mask-shuffle</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/merge.html">merge</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-avro-file.html">parse-as-avro-file</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-avro.html">parse-as-avro</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-csv.html">parse-as-csv</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-date.html">parse-as-date</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-excel.html">parse-as-excel</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-fixed-length.html">parse-as-fixed-length</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-hl7.html">parse-as-hl7</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-json.html">parse-as-json</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-log.html">parse-as-log</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-simple-date.html">parse-as-simple-date</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-as-xml.html">parse-as-xml</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-timestamp.html">parse-timestamp</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/parse-xml-to-json.html">parse-xml-to-json</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/quantize.html">quantize</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/rename.html">rename</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/send-to-error.html">send-to-error</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/set-charset.html">set-charset</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/set-column.html">set-column</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/set-columns.html">set-columns</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/set-record-delim.html">set-record-delim</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/set-type.html">set-type</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/set-variable.html">set-variable</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/split-by-separator.html">split-by-separator</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/split-email.html">split-email</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/split-to-columns.html">split-to-columns</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/split-to-rows.html">split-to-rows</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/split-url.html">split-url</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/stemming.html">stemming</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/swap.html">swap</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/table-lookup.html">table-lookup</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/text-distance.html">text-distance</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/text-metric.html">text-metric</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/trim.html">trim</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/url-decode.html">url-decode</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/url-encode.html">url-encode</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/write-as-csv.html">write-as-csv</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/write-as-json-map.html">write-as-json-map</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/write-as-json-object.html">write-as-json-object</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/directives/xpath.html">xpath</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../data-preparation/functions/index.html">      函数</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/functions/json-functions.html">JSON 函数</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/functions/type-functions.html">类型函数</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/functions/geofence-functions.html">地理围栏函数</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/functions/dq-functions.html">数据质量函数</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/functions/date-functions.html">日期函数</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/functions/ddl-functions.html">DDL 函数</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../data-preparation/service/index.html">      服务</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/service/admin.html">行政和管理服务</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/service/connection-properties.html">连接属性</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/service/connections.html">连接服务</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/service/execution.html">数据处理指令执行</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/service/request.html">请求格式规范</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/service/schema-registry.html">Schema 注册库</a></li>
<li class="toctree-l3"><a class="reference internal" href="../data-preparation/service/services.html">数据预处理服务</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../data-preparation/performance.html">性能</a></li>
<li class="toctree-l2"><a class="reference internal" href="../data-preparation/exclusion-and-aliasing.html">排除与别名</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../pipelines/index.html"> 数据流管道</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/concepts-design.html"> 概念与设计</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/getting-started.html"> 入门指南</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/studio.html"> CDAP 数据流设计器</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/creating-pipelines.html"> 创建数据流管道</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/running-pipelines.html"> 运行数据流管道</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/plugin-management.html"> 插件管理</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pipelines/plugins/index.html"> 插件参考</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/actions/index.html"> Action Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/sources/index.html"> Source Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/transforms/index.html"> Transform Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/analytics/index.html"> Analytic Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/sinks/index.html"> Sink Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/shared-plugins/index.html"> Shared Plugins</a><ul>
<li class="toctree-l4"><a class="reference internal" href="../pipelines/plugins/shared-plugins/core.html">CoreValidator</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../pipelines/plugins/post-run-plugins/index.html"> Post-run Plugins</a><ul class="simple">
</ul>
</li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../mmds/index.html"> 数据分析</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../mmds/concepts.html"> Concepts</a></li>
<li class="toctree-l2"><a class="reference internal" href="../mmds/feature-gen.html"> Feature Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../mmds/modeling.html"> Modeling</a></li>
<li class="toctree-l2"><a class="reference internal" href="../mmds/example.html"> Example</a></li>
</ul>
</li>
</ul>
</nav>
      </li>
      <li class="toctree-l1"><a href="../table-of-contents/../../developer-manual/index.html" rel="nofollow">开发手册</a>
      </li>
      <li class="toctree-l1"><a href="../table-of-contents/../../admin-manual/index.html" rel="nofollow">管理手册</a>
      </li>
      <li class="toctree-l1"><a href="../table-of-contents/../../integrations/index.html" rel="nofollow">集成手册</a>
      </li>
      <li class="toctree-l1"><a href="../table-of-contents/../../examples-manual/index.html" rel="nofollow">最佳实践</a>
      </li>
    </ul>
  </div></div>
    </div><div class="col-md-8 content" id="main-content">
    
  <div class="section" id="xml">
<span id="tutorials-nytimes"></span><h1>示例: 使用纽约时报 XML 数据推送<a class="headerlink" href="#xml" title="Permalink to this headline">🔗</a></h1>
<div class="section" id="id1">
<h2>简介<a class="headerlink" href="#id1" title="Permalink to this headline">🔗</a></h2>
<p>本教程演示了如何使用 CDAP 的数据预处理和数据流管道来从《纽约时报》 XML 数据推送中提取并提供关键业务信息.</p>
</div>
<div class="section" id="id2">
<h2>场景<a class="headerlink" href="#id2" title="Permalink to this headline">🔗</a></h2>
<p>您的组织是一家在多个大洲经营业务的跨国企业,
对监视来自许多不同新闻站点(例如, 纽约时报) 推送的 RSS XML 新闻提要数据感兴趣.
您想将来自不同地区的故事及时传递给相关分析人员.</p>
<ul class="simple">
<li>您希望将有关巴西的故事写入数据集 (通过网络 web 应用显示) 给拉丁美洲的分析人员</li>
<li>您希望将有关俄罗斯的故事写入数据集 (通过网络 web 应用显示) 给亚洲分析人员</li>
</ul>
</div>
<div class="section" id="id3">
<h2>数据<a class="headerlink" href="#id3" title="Permalink to this headline">🔗</a></h2>
<p>单击下面的按钮以下载包含完成本教程所需数据的 <cite>.xml</cite> 文件.</p>
<p><a class="reference download internal" download="" href="../_downloads/ea2b31e2d501091c0f846fec381ec7f5/nytimes-world.xml"><code class="xref download docutils literal notranslate"><span class="pre">nytimes-world.xml</span></code></a></p>
</div>
<div class="section" id="id4">
<h2>视频教程<a class="headerlink" href="#id4" title="Permalink to this headline">🔗</a></h2>
<p>(暂未提供)</p>
<!-- youtube:: e-5K4cxwGrc -->
</div>
<div class="section" id="id5">
<h2>操作步骤<a class="headerlink" href="#id5" title="Permalink to this headline">🔗</a></h2>
<div class="section" id="id6">
<h3>加载数据<a class="headerlink" href="#id6" title="Permalink to this headline">🔗</a></h3>
<p>首先, 从上面的 <cite>数据</cite> 部分下载文件.</p>
<p>开始之前, 请从 CDAP 主页导航到数据预处理选项卡. 在数据预处理中, 选择左侧的箭头. 从 <cite>文件系统</cite> 上传 <cite>nytimes-world.xml.</cite></p>
</div>
<div class="section" id="id7">
<h3>处理 XML 数据<a class="headerlink" href="#id7" title="Permalink to this headline">🔗</a></h3>
<p>您应该在 <cite>body</cite> 列中看到一行 XML 数据. 从 <cite>body</cite> 列的下拉菜单中, 选择 解析 &gt; 从 XML 到 JSON.
应用转换 (深度为 “1”).</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/xmltojson.jpeg"><img alt="../_images/xmltojson.jpeg" class="bordered-image" src="../_images/xmltojson.jpeg" style="width: 500px;" /></a>
</div>
<p>在屏幕的右侧, 有两个选项卡用于控制数据选择和数据处理指令: <cite>数据列</cite> 和 <cite>数据处理指令.</cite> 选择 <cite>数据列</cite> 表, 然后选中 <cite>body_rss_channel_item.</cite></p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/keep.jpeg"><img alt="../_images/keep.jpeg" class="bordered-image" src="../_images/keep.jpeg" style="width: 400px;" /></a>
</div>
<p>您将看到相应的数据列已高亮显示. 使用列名旁边的插入符号图标, 选择 <cite>保留选中列.</cite></p>
<p><cite>保留选中列</cite> 是一个有用的数据处理指令, 因为它允许您只选择要保留的少量列, 从而一次删除大量不需要的列.</p>
<p>所有其他列应该都被删除了. 现在, 在 <cite>body_rss_channel_item</cite> 上选择插入符, 并应用 深度为 1 的 解析 &gt; JSON.</p>
<p>重复使用这个指令几次. 您会看到总共出现了 16 个数据列. 每行包含一个纽约时报故事的描述.</p>
<p>最后, 您将删除所有不包含对我们有用的信息的列. 在 <cite>数据列</cite> 选项卡下, 选择:</p>
<ul class="simple">
<li><cite>body_rss_channel_item_link</cite></li>
<li><cite>body_rss_channel_item_dc:creator</cite></li>
<li><cite>body_rss_channel_item_title</cite></li>
<li><cite>body_rss_channel_item_category</cite></li>
<li><cite>body_rss_channel_item_pubDate</cite></li>
</ul>
<p>从这些列之一选择下拉记号, 然后选择 <cite>保留选中列</cite> 选项.</p>
<p>将上面的列 (按顺序) 重命名为:</p>
<ul class="simple">
<li><cite>link</cite></li>
<li><cite>creator</cite></li>
<li><cite>title</cite></li>
<li><cite>category</cite></li>
<li><cite>pubDate</cite></li>
</ul>
<p>您可以通过单击名称以使其可编辑来更改列名称.</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/prepared_data.jpeg"><img alt="../_images/prepared_data.jpeg" class="bordered-image" src="../_images/prepared_data.jpeg" style="width: 800px;" /></a>
</div>
</div>
<div class="section" id="id8">
<h3>从类别数据中提取国家信息<a class="headerlink" href="#id8" title="Permalink to this headline">🔗</a></h3>
<p>您的目标是将有关俄罗斯的故事发送到由亚洲团队监视的数据库,
以及将有关巴西的故事发送到由拉丁美洲团队监视的数据库.</p>
<p>检查 <cite>category</cite> 列, 您将看到 JSON 对象包含以下格式的 URL:</p>
<p><code class="docutils literal notranslate"><span class="pre">http://www.nytimes.com/namespaces/keywords/nyt_geo</span></code></p>
<p>使用 <code class="docutils literal notranslate"><span class="pre">nyt_geo</span></code> 标签, 您可以过滤出感兴趣的地区.</p>
<p>现在, 您想解析此 JSON 以便可以检索感兴趣的区域. 从 <cite>category</cite> 列的插入符号下拉选项中,
选择 解析 &gt; JSON. 在 <cite>category</cite> 列上重复几次这个操作.</p>
<p>现在，您将有两个新列 <code class="docutils literal notranslate"><span class="pre">category_domain</span></code> 和 <code class="docutils literal notranslate"><span class="pre">category_content</span></code>.</p>
<p>从 <code class="docutils literal notranslate"><span class="pre">category_domain</span></code> 的下拉菜单中, 选择 过滤器 &gt; 保留数据行 &gt; 如果值包含. 指定包含值为 <cite>nyt_geo</cite>. 点击 <cite>应用</cite>.</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/prepared_data.jpeg"><img alt="../_images/prepared_data.jpeg" class="bordered-image" src="../_images/prepared_data.jpeg" style="width: 800px;" /></a>
</div>
<p>将仅保留标记有地理类别的故事. 由于您已将 <cite>category_domain</cite> 列过滤为单个值,
您可以继续进行操作, 通过从下拉菜单选择 <cite>删除数据列</cite> 删除它.</p>
<p>在教程结尾创建的数据流管道中，您将把俄罗斯故事和巴西故事引导到不同的数据库.</p>
</div>
<div class="section" id="url">
<h3>清理 URL<a class="headerlink" href="#url" title="Permalink to this headline">🔗</a></h3>
<p>接下来, 您会看到 <cite>link</cite> 列是如下格式:</p>
<p><cite>http://www.nytimes.com/2016/09/24/world/asia/chinese-medicine-paul-unschuld.html?partner=rss&amp;emc=rss</cite></p>
<p>您希望清理 <cite>partner=rss&amp;emc=rss</cite> 后缀. 另外, 您只关心相对路径. 例如, 上面的 URL 将变为:</p>
<p><cite>/2016/09/24/world/asia/chinese-medicine-paul-unschuld.html</cite></p>
<p>您可以使用 <cite>提取字段</cite> 功能. <cite>提取字段</cite> 提供了一套功能强大的工具, 用于自动分析数据, 例如 URL, e-mail, SSN, 等等.</p>
<p>从 <cite>link</cite> 的下拉菜单, 选择 提取字段 &gt; 使用模式.</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/pattern.jpeg"><img alt="../_images/pattern.jpeg" class="bordered-image" src="../_images/pattern.jpeg" style="width: 500px;" /></a>
</div>
<p>在 <cite>使用模式提取字段</cite> 菜单, 选择 <cite>Start/End</cite> 模式. 指定开始模式为 <cite>http://www.nytimes.com</cite>, 结束模式为 <cite>?partner=rss&amp;emc=rss</cite>.</p>
<p>单击 <cite>提取</cite>. 您可以删除 <cite>link</cite> 列并重命名 <cite>link_1_1</cite> 为 <cite>link</cite>.</p>
</div>
<div class="section" id="id9">
<h3>格式化日期<a class="headerlink" href="#id9" title="Permalink to this headline">🔗</a></h3>
<p>您想将 <cite>pubDate</cite> 列, 当前是一个字符串, 转换为 <cite>日期</cite> 对象. 从 <cite>pubDate</cite> 列, 选择 解析 &gt; 自然日期. 时区为 <cite>GMT</cite>.</p>
<p>应用此数据处理指令后, 将看到一个名为 <cite>pubDate_1</cite> 的新列, 其中是一个日期对象.
您可以删除 <cite>pubDate</cite> 列并将新列重命名为 <cite>pubDate</cite>.</p>
</div>
<div class="section" id="id10">
<h3>清理作者姓名<a class="headerlink" href="#id10" title="Permalink to this headline">🔗</a></h3>
<p>最后, 所有作者名称都是大写的, 例如 <cite>KIRK SEMPLE</cite>. 您希望以更专业的格式来提供, 也就是,  <cite>Kirk Semple</cite>.</p>
<p>从 <cite>creator</cite> 列的下拉菜单中, 选择 格式化 &gt; 修改为 首字母大写.
您将看到所有作者姓名现已转换为正确的大小写了.</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/titlecase.jpeg"><img alt="../_images/titlecase.jpeg" class="bordered-image" src="../_images/titlecase.jpeg" style="width: 500px;" /></a>
</div>
</div>
<div class="section" id="id11">
<h3>创建数据流管道<a class="headerlink" href="#id11" title="Permalink to this headline">🔗</a></h3>
<p>现在，您已经准备好数据，您可以创建一个数据流管道, 将记录发送到巴西/俄罗斯数据库.</p>
<p>首先, 点击 <cite>创建数据流管道</cite>. 选择 <cite>批处理数据流</cite>.</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/create_pipeline.jpeg"><img alt="../_images/create_pipeline.jpeg" class="bordered-image" src="../_images/create_pipeline.jpeg" style="width: 300px;" /></a>
</div>
<p>您将看到以下数据流管道出现在屏幕上.</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/new_pipeline.jpeg"><img alt="../_images/new_pipeline.jpeg" class="bordered-image" src="../_images/new_pipeline.jpeg" style="width: 500px;" /></a>
</div>
<p>向画布添加两个 <cite>Avro 时间分区数据集</cite> 接收器, 以及两个 <cite>Python 执行器</cite> (从 <cite>数据转换</cite> 菜单).</p>
<p>按照以下格式排列在画布上.</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/format.jpeg"><img alt="../_images/format.jpeg" class="bordered-image" src="../_images/format.jpeg" style="width: 500px;" /></a>
</div>
<p>您可以通过单击 <cite>对齐</cite> 按钮来完成排列, 它是从顶部数第四个按钮.</p>
<p>打开上面的 <cite>Python 执行器</cite> 阶段配置. 用以下代码段替换预填充的代码:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">record</span><span class="p">,</span> <span class="n">emitter</span><span class="p">,</span> <span class="n">context</span><span class="p">):</span>
  <span class="k">if</span> <span class="p">(</span><span class="n">record</span><span class="p">[</span><span class="s1">&#39;category_content&#39;</span><span class="p">]</span> <span class="o">==</span> <span class="s1">&#39;Brazil&#39;</span><span class="p">):</span>
    <span class="n">emitter</span><span class="o">.</span><span class="n">emit</span><span class="p">(</span><span class="n">record</span><span class="p">)</span>
</pre></div>
</div>
<p>此代码仅将标记为 ‘Brazil’ 的记录传递到接收器.</p>
<p>将 <cite>名称</cite> 设置为 <cite>BrazilEvaluator</cite> 然后退出设置窗口.</p>
<p>现在, 打开 <cite>BrazilEvaluator</cite> 关联的接收器的配置. 将 <cite>名称</cite> 和 <cite>数据集名称</cite> 都修改为 <cite>BrazilSink</cite>.</p>
<p>对数据流管道的其他分支重复以上过程, 只是将 <cite>Brazil</cite> 替换为 <cite>Russia</cite>.</p>
<p>通过单击页面顶部的 <cite>命名你的数据流管道</cite> 设置数据流管道名称为 <cite>NewYorkTimesPipeline</cite>.</p>
<p>您的数据流管道现在应如下图所示.</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/final.jpeg"><img alt="../_images/final.jpeg" class="bordered-image" src="../_images/final.jpeg" style="width: 500px;" /></a>
</div>
<p>部署数据流管道后, 就不能再编辑阶段或设置. 因此, 您需要在部署之前确保数据流管道按预期工作.</p>
<p>请点击 <cite>预览</cite>:</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/preview.jpeg"><img alt="../_images/preview.jpeg" class="bordered-image" src="../_images/preview.jpeg" style="width: 500px;" /></a>
</div>
<p>当您单击 <cite>运行</cite> 后, 您将看到计时器开始跟踪数据流管道运行了多长时间. 完成后, 您可以查看通过每个阶段的记录.
例如, 单击 <cite>BrazilEvaluator</cite>, 您可以看到只有与 <cite>Brazil</cite> 匹配的记录才通过:</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/brazil.jpeg"><img alt="../_images/brazil.jpeg" class="bordered-image" src="../_images/brazil.jpeg" style="width: 500px;" /></a>
</div>
<p>现在, 您可以单击 <cite>部署</cite> 并运行已部署的数据流管道.</p>
</div>
<div class="section" id="id12">
<h3>查询结果<a class="headerlink" href="#id12" title="Permalink to this headline">🔗</a></h3>
<p>您可以单击 <cite>BrazilSink</cite> 查看配置, 然后选择右上角的 <cite>查看详细</cite>.</p>
<p>数据集页面打开后, 单击右上角的眼睛图标. 单击出现的窗口上的 <cite>执行</cite>. 查询执行时, 单击眼睛图标以查看结果的子集.</p>
<div class="figure align-center" style="width: 100%">
<a class="bordered-image reference internal image-reference" href="../_images/brazil_results.jpeg"><img alt="../_images/brazil_results.jpeg" class="bordered-image" src="../_images/brazil_results.jpeg" style="width: 500px;" /></a>
</div>
<p>恭喜你! 你已经清洗了纽约时报的 XML 数据, 并根据地理区域对其进行了拆分, 然后写入了两个数据库.</p>
</div>
</div>
</div>

</div>
    <div class="col-md-2">
      <div id="right-sidebar" class="bs-sidenav scrollable-y" role="complementary">
        <div id="localtoc-scrollspy">
        </div>
      </div>
    </div></div>
</div>
<!-- block main content end -->
<!-- block footer -->
<footer class="footer">
      <div class="container">
        <div class="row">
          <div class="col-md-2 footer-left"><a title="示例: 使用客户地址信息分发市场推广材料" href="campaign.html">Previous</a></div>
          <div class="col-md-8 footer-center"><a class="footer-tab-link" href="../table-of-contents/../../reference-manual/licenses/index.html">Copyright</a> &copy; 2014-2020 Cask Data, Inc.&bull; <a class="footer-tab-link" href="//docs.cask.co/cdap/6.1.1/cdap-docs-6.1.1-web.zip" rel="nofollow">Download</a> an archive or
<a class="footer-tab-link" href="//docs.cask.co/cdap">switch the version</a> of the documentation
          </div>
          <div class="col-md-2 footer-right"><a title="Example: Building a Stock Selection Pipeline" href="stocks.html">Next</a></div>
        </div>
      </div>
    </footer>
<!-- block footer end -->
<script type="text/javascript" src="../_static/bootstrap-3.3.6/js/bootstrap.min.js"></script><script type="text/javascript" src="../_static/js/bootstrap-sphinx.js"></script><script type="text/javascript" src="../_static/js/abixTreeList-2.js"></script><script type="text/javascript" src="../_static/js/cdap-dynamicscrollspy-4.js"></script><script type="text/javascript" src="../_static/js/cdap-version-menu.js"></script><script type="text/javascript" src="../_static/js/copy-to-clipboard.js"></script><script type="text/javascript" src="../_static/js/jquery.mousewheel.min.js"></script><script type="text/javascript" src="../_static/js/jquery.mCustomScrollbar.js"></script><script type="text/javascript" src="../_static/js/js.cookie.js"></script><script type="text/javascript" src="../_static/js/tabbed-parsed-literal-0.2.js"></script><script type="text/javascript" src="../_static/js/cdap-onload-javascript.js"></script><script type="text/javascript" src="../_static/js/cdap-version-menu.js"></script>
    <script src="https://cdap.gitee.io/docs/cdap/json-versions.js"></script>
  </body>
</html>